Purpose:
Calculate and plot mutational signatures for all samples using COSMIC signatures and Alexandrov et al, 2013 mutational signatures.
To run this from the command line, use:
Rscript -e "rmarkdown::render('analyses/mutational-signatures/01-known_signatures.Rmd', clean = TRUE)"
This assumes you are in the top directory of the repository.
Import necessary functions.
# Magrittr pipe
`%>%` <- dplyr::`%>%`
# Import specialized functions
source(file.path("util", "mut_sig_functions.R"))
# Load this library
library(deconstructSigs)
library(tidyverse)
── Attaching core tidyverse packages ────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.1 ✔ readr 2.1.4
✔ forcats 1.0.0 ✔ stringr 1.5.0
✔ ggplot2 3.4.2 ✔ tibble 3.2.1
✔ lubridate 1.9.2 ✔ tidyr 1.3.0
✔ purrr 1.0.1
── Conflicts ──────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
Set up directory paths.
data_dir <- file.path("..", "..", "data")
input_dir <- "input"
results_dir <- "results"
plots_dir <- "plots"
figures_dir <- file.path("..", "..", "figures")
cosmicv2_plots <- file.path(plots_dir, "cosmicv2")
nature_plots <- file.path(plots_dir, "nature")
cosmicv3_plots <- file.path(plots_dir, "cosmicv3")
scratch_dir <- file.path("..", "..", "scratch", "mutational-signatures")
cosmicv2_scratch <- file.path(scratch_dir, "cosmicv2")
cosmicv3_scratch <- file.path(scratch_dir, "cosmicv3")
nature_scratch <- file.path(scratch_dir, "nature")
Make new directories for the results.
if (!dir.exists(results_dir)) {
dir.create(results_dir)
}
if (!dir.exists(cosmicv2_plots)) {
dir.create(cosmicv2_plots, recursive = TRUE)
}
if (!dir.exists(nature_plots)) {
dir.create(nature_plots, recursive = TRUE)
}
if (!dir.exists(cosmicv3_plots)) {
dir.create(cosmicv3_plots, recursive = TRUE)
}
if (!dir.exists(scratch_dir)) {
dir.create(scratch_dir)
}
if (!dir.exists(cosmicv2_scratch)) {
dir.create(cosmicv2_scratch)
}
if (!dir.exists(cosmicv3_scratch)) {
dir.create(cosmicv3_scratch)
}
if (!dir.exists(nature_scratch)) {
dir.create(nature_scratch)
}
# Declare file path for consensus file
consensus_file <- file.path(data_dir, "snv-consensus-plus-hotspots.maf.tsv.gz")
Read in the consensus MAF file.
# Read in the file
maf <- data.table::fread(consensus_file, data.table = FALSE)
Read in the histology colors and labels.
histology_label_mapping <- readr::read_tsv(
file.path(figures_dir, "palettes", "histology_label_color_table.tsv")
) %>%
# Select just the columns we will need for plotting
dplyr::select(Kids_First_Biospecimen_ID, display_group, display_order, hex_codes) %>%
# Reorder display_group based on display_order
dplyr::mutate(display_group = forcats::fct_reorder(display_group, display_order))
Rows: 2841 Columns: 11
── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (9): Kids_First_Biospecimen_ID, sample_type, integrated_diagnosis, Notes...
dbl (2): n, display_order
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Set up gradient color palette for the bubble matrix plots.
gradient_col_palette <- readr::read_tsv(
file.path(figures_dir, "palettes", "gradient_color_palette.tsv")
)
Rows: 11 Columns: 2
── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (2): color_names, hex_codes
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Won't need NA color this time.
gradient_col_palette <- gradient_col_palette %>%
dplyr::filter(color_names != "na_color")
Read in the metadata and set it up with the color palette. Separate BS_ids into intial CNS tumors and other tumors, as mutational signatures analyses will be run on a different set of signatures for each group.
metadata_df <- readr::read_tsv(file.path(data_dir, "histologies.tsv"), guess_max = 10000) %>%
# dplyr::select("Kids_First_Biospecimen_ID", "experimental_strategy") %>%
dplyr::inner_join(histology_label_mapping, by = "Kids_First_Biospecimen_ID") %>%
dplyr::rename(Tumor_Sample_Barcode = "Kids_First_Biospecimen_ID")
Warning: One or more parsing issues, call `problems()` on your data frame for details, e.g.:
dat <- vroom(...)
problems(dat)
Rows: 43729 Columns: 55
── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (38): Kids_First_Biospecimen_ID, sample_id, aliquot_id, Kids_First_Parti...
dbl (17): age_at_diagnosis_days, OS_days, EFS_days, age_last_update_days, no...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
initial_pretx_tumors <- metadata_df %>%
filter(tumor_descriptor == "Initial CNS Tumor" & age_at_diagnosis_days < age_at_chemo_start) %>%
pull(Tumor_Sample_Barcode)
# pretx_tumors <- readr::read_tsv(file.path(input_dir, "CBTN_all_therapy_20220922.tsv")) %>%
# mutate(Pretreatment = case_when(`Age at Diagnosis` < `Age at Chemotherapy Start` ~ "yes",
# `Age at Diagnosis` > `Age at Chemotherapy Start` ~ "no",
# TRUE ~ as.character(NA))) %>%
# filter(Pretreatment == "yes") %>%
# pull(cohort_participant_id)
#
# initial_pretx_tumors <- metadata_df %>%
# filter(tumor_descriptor == "Initial CNS Tumor" & cohort_participant_id %in% pretx_tumors) %>%
# pull(Tumor_Sample_Barcode)
Read in tmb-all file with WGS and WXS region lengths so they can be used for the Mb denominator.
# Set up BED region file for TMB calculations
region_sizes <- readr::read_tsv(file.path(data_dir, "snv-mutation-tmb-all.tsv")) %>%
dplyr::select(Tumor_Sample_Barcode, region_size)
Rows: 2766 Columns: 5
── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (2): Tumor_Sample_Barcode, experimental_strategy
dbl (3): mutation_count, region_size, tmb
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Determine how many mutations we have per sample.
mut_per_sample <- maf %>%
dplyr::group_by(Tumor_Sample_Barcode) %>%
dplyr::tally() %>%
dplyr::arrange(n)
summary(mut_per_sample$n)
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.0 125.2 989.0 3567.7 2101.8 1077834.0
Graph this.
ggplot2::ggplot(mut_per_sample, ggplot2::aes(x = n, geom = "density")) +
ggplot2::geom_density() +
ggplot2::theme_classic()
Make mutation data into deconstructSigs input
format.
# Convert to deconstructSigs input
sigs_input <- mut.to.sigs.input(
mut.ref = maf,
sample.id = "Tumor_Sample_Barcode",
chr = "Chromosome",
pos = "Start_Position",
ref = "Reference_Allele",
alt = "Allele",
bsg = BSgenome.Hsapiens.UCSC.hg38::BSgenome.Hsapiens.UCSC.hg38
)
Warning in mut.to.sigs.input(mut.ref = maf, sample.id = "Tumor_Sample_Barcode", : Some samples have fewer than 50 mutations:
BS_02QV7ZWP, BS_1B00Q25Y, BS_3DV5FVPQ, BS_4SCDT7M9, BS_7KR13R3P, BS_91HP99HY, BS_BTDQZ60Q, BS_GQFPB8F3, BS_KHJMCFQR, BS_KT3GAAQV, BS_P7VR731D, BS_QF7M4SHH, BS_ST7KGV85, BS_W60PB1DS, BS_X0KN4VVW, BS_YGDHJ68W, BS_Z8GRJ71M, BS_GJ2KS7BT, BS_08QFCDXB, BS_0W0G7PYS, BS_026TDAF1, BS_09983DA9, BS_1V616KKY, BS_1XT92Z85, BS_1ZX8B4H6, BS_2HE0SGCP, BS_33FDZ9QY, BS_2WEA24FV, BS_2W1D018F, BS_3CW6E0W8, BS_37P1RE8J, BS_3FMM6P58, BS_3KHY7S7N, BS_39JWYN4D, BS_41Q1D6PV, BS_443KSJR9, BS_473TYSNE, BS_4882NV0Q, BS_4A55PHAE, BS_4CDQ0JJ1, BS_4CGJYHHN, BS_4EY5V954, BS_557R1AGJ, BS_6CKTRR1X, BS_75BBX7JP, BS_6TR1TNS7, BS_790WPA1C, BS_85EWSYGV, BS_7PF27EEG, BS_88GET7GV, BS_8QR5CHV7, BS_9EFHG5PZ, BS_981ME2R1, BS_9N1V6VN2, BS_9PEXXJMK, BS_9X5AABEM, BS_A6E869SH, BS_ADMV7AV5, BS_A0R18CS1, BS_AM3NM05Y, BS_AT709G19, BS_AYDXB2NA, BS_AVXJSWEN, BS_AZ7C646B, BS_B05V3JN7, BS_B64J2XCQ, BS_BAD2CHGE, BS_BJ6GHH0W, BS_BQK1JF37, BS_BYXPP48M, BS_CVCR0QTK, BS_D0QNVE94, BS_DJ05Y3P6, BS_DSAGRKDR, BS_DVPGAFPS, BS_E6RGTGE7, BS_E9CF9YSP, BS_E6V9F7GA, BS_EBPF8GTD, BS_ENJZQ5F6, BS_FFZGMSMQ, BS_ENP5SS80, BS_FWVSVP0T, BS_G6W2T7CV, BS_GQN8GB9K, BS_GZJADN7T, BS_H7AWWS55, BS_GVZPVW85, BS_HWT4F4Z1, BS_HRAPJC76, BS_J8CDEQ26, BS_K59BPPX3, BS_KQM96PHV, BS_KF7X1WDR, BS_KY0N6ZCY, BS_N9KWJSW0, BS_MCB9QYD6, BS_M3KA4SSE, BS_N9MHYWDZ, BS_NBW2EEMA, BS_NVB25WVY, BS_NZX1S9QT, BS_P65W011T, BS_PHVX4NQ6, BS_PPC82HCS, BS_Q2R56X78, BS_Q9MMJ9H4, BS_QF0WQY17, BS_QRTBPSN6, BS_QV60J6XZ, BS_RPRNKMCQ, BS_RK6YX3RE, BS_RRN8YDWK, BS_RRFV4R8C, BS_SDKP0388, BS_SEFYKVEW, BS_SQ2FFGA1, BS_SYQ1GRJA, BS_T1NQGNEX, BS_TSCE1B96, BS_TW4ZGQ3E, BS_T2ASQ6ZF, BS_TE7Q5QWK, BS_V1BNA817, BS_V54KSGNF, BS_VEK54PE0, BS_VGTP4Z43, BS_VGXWS93R, BS_WV0GH7EN, BS_XA22ZVF2, BS_W3AK2Q14, BS_XE7YZD41, BS_YP0K170J, BS_YRVPJSEG, BS_YZ22MXW3, BS_Z0C4H94E, BS_Z8X2WQFH, TARGET-10-PATZFF-09A-01D, TARGET-50-PALJIP-01A-01D, TARGET-10-PASJMK-09A-01D, TARGET-10-PARHES-09A-01D, TARGET-10-PATWXC-09A-01D, TARGET-10-PATDMN-09A-01D, TARGET-10-PASSPP-09A-01D, TARGET-10-PARWID-09A-01D, TARGET-10-PASXLT-09A-01D, TARGET-10-PARASZ-09A-01D, TARGET-10-PANYZE-09A-01D, TARGET-10-PASZJW-03A-01D, TARGET-10-PAPZNK-03A-01D, TARGET-10-PATSDS-03A-01D, TARGET-10-PATGWP-09A-01D, TARGET-10-PATCZN-09A-01D, TARGET-10-PARJXW-09A-01D, TARGET-10-PARXLS-09A-01D, TARGET-10-PAUBXP-09A-01D, TARGET-10-PATZYC-09A-01D, TARGET-50-PAKFYV-01A-01D, TARGET-10-PASHXL-09A-01D, TARGET-10-PASXUU-09A-01D, TARGET-50-PAJPCM-01A-01D, TARGET-50-PAJNRH-01A-01D, TARGET-10-PASHNK-09A-01D, TARGET-10-PATLPN-03A-01D, TARGET-10-PARMIH-09A-01D, TARGET-50-PAKXWB-01A-01D, TARGET-50-PAJNNC-01A-01D, TARGET-10-PATYWV-09A-01D, TARGET-10-PARYGI-03A-01D, TARGET-10-PASKTG-09A-01D, TARGET-40-PASKZZ-01A-01D, TARGET-10-PATSLH-09A-01D, TARGET-10-PAPDUV-09A-01D, TARGET-10-PARNXJ-09A-01D, TARGET-10-PARPYJ-09A-01D, TARGET-10-PARGFL-09A-01D, TARGET-10-PAUBLL-09A-01D, TARGET-50-PAJMEP-01A-01D, TARGET-40-PASYUK-01A-01D, TARGET-50-PAKPDF-01A-01D, TARGET-10-PASILW-09A-01D, TARGET-10-PASIIY-09A-01D, TARGET-10-PAPHED-09A-01D, TARGET-30-PATNPW-01A-01D, TARGET-30-PASKRS-01A-01D, TARGET-30-PATSKE-01A-01D, TARGET-30-PAUTVX-01A-01D, TARGET-30-PAMMXF-01A-01W, TARGET-30-PATSJV-01A-01D, TARGET-10-PARBCW-09A-01D, TARGET-30-PATBJI-01A-01D, TARGET-10-PARBGG-09A-01D, TARGET-30-PAURZH-01A-01D, TARGET-30-PAUZTF-01A-01D, TARGET-30-PASDDP-01A-01D, TARGET-10-PANXEE-09A-01D, TARGET-30-PAUPDY-01A-01D, TARGET-30-PATUNX-01A-01D, TARGET-30-PAUGIP-01A-01D, TARGET-10-PANXCX-09A-01D, TARGET-30-PATYDC-01A-01D, TARGET-30-PAUJTY-01A-01D, TARGET-30-PAUKRF-01A-01D, TARGET-30-PATRXC-01A-01D, TARGET-40-PALKGN-01A-01D, TARGET-30-PAUIWS-01A-01D, TARGET-10-PANVAW-09A-01D, TARGET-30-PASGNT-01A-01D, TARGET-30-PAPBJT-01A-01W, TARGET-30-PATRHD-01A-01D, TARGET-30-PASRWE-01A-01D, TARGET-30-PATYMK-01A-01D, TARGET-30-PAUUZU-01A-01D, TARGET-30-PARVWL-01A-01D, TARGET-30-PATAAV-01A-01D, TARGET-30-PASRSG-01A-01D, TARGET-30-PARTCE-01A-01D, TARGET-10-PANZZI-09A-01D, TARGET-30-PAVALS-01A-01D, TARGET-30-PAUXUP-01A-01D, TARGET-30-PAUMXC-01A-01D, TARGET-10-PARJMY-09A-01D, TARGET-30-PAUBPW-09A-01D, TARGET-10-PAPHRV-09A-01D, TARGET-30-PASUYL-01A-01D, TARGET-30-PASAFG-01A-01D, TARGET-30-PATUPR-09A-01D, TARGET-10-PARGGI-09A-01D, TARGET-10-PAPBCK-09A-01D, TARGET-30-PATHJU-09A-01D, TARGET-30-PAVEZM-01A-01D, TARGET-30-PATFTR-01A-01D, TARGET-30-PASMUB-01A-01D, TARGET-30-PASLRM-01A-01D, TARGET-30-PASZST-01A-01D, TARGET-30-PAUELT-01A-01D, TARGET-30-PAUDDZ-01A-01D, TARGET-30-PARHDE-01A-01D, TARGET-30-PATBAC-01A-01D, TARGET-30-PASALE-01A-01D, TARGET-30-PASPGU-01A-01D, TARGET-30-PASWVE-01A-01D, TARGET-30-PAICGF-01A-01W, TARGET-10-PANTLF-03A-01D, TARGET-30-PAMCXF-01A-01W, TARGET-30-PATCDJ-01A-01D, TARGET-30-PASGCD-01A-01D, TARGET-30-PASKZT-01A-01D, TARGET-30-PATYMZ-01A-01D, TARGET-30-PASZYY-01A-01D, TARGET-30-PAIXNC-01A-01W, TARGET-30-PASBDN-01A-01D, TARGET-20-PANSBH-04A-01D, TARGET-10-PARLMI-09A-01D, TARGET-30-PAKIPY-01A-01W, TARGET-30-PAUBDC-01A-01D, TARGET-30-PAUZSB-01A-01D, TARGET-30-PAUYXX-01A-01D, TARGET-30-PAUKNU-01A-01D, TARGET-30-PASJTA-01A-01D, TARGET-10-PARKFN-09A-01D, TARGET-30-PAUNWR-01A-01D, TARGET-30-PAUGZD-01A-01D, TARGET-30-PALETP-01A-01W, TARGET-30-PAUDPP-01A-01D, TARGET-30-PAUJLH-01A-01D, TARGET-30-PASYTP-01A-01D, TARGET-30-PATAFE-01A-01D, TARGET-30-PAUMBB-01A-01D, TARGET-10-PARATY-09A-01D, TARGET-30-PASKKZ-01A-01D, TARGET-10-PANVIC-03A-01D, TARGET-30-PAKHCF-01A-01W, TARGET-30-PAUYDE-01A-01D, TARGET-30-PASAJY-01A-01D, TARGET-30-PASJWG-01A-01D, TARGET-30-PASAAN-01A-01D, TARGET-30-PATLCM-01A-01D, TARGET-30-PATXXI-01A-01D, TARGET-30-PATVWA-01A-01D, TARGET-30-PASJWA-01A-01D, TARGET-10-PAPADT-09A-01D, TARGET-30-PASMET-01A-01D, TARGET-30-PAUHYY-01A-01D, TARGET-30-PARYVD-01A-01D, TARGET-30-PAUBSW-01A-01D, TARGET-30-PASLPG-01A-01D, TARGET-30-PAUGRP-01A-01D, TARGET-30-PASCZY-01A-01D, TARGET-10-PAPHBW-09A-01D, TARGET-10-PAPAGS-09A-01D, TARGET-30-PATNRI-01A-01D, TARGET-30-PASCEW-09A-01D, TARGET-30-PARUGX-01A-01D, TARGET-30-PAPEFE-01A-01W, TARGET-30-PATUEH-01A-01D, TARGET-30-PAIXNV-01A-01W, TARGET-30-PAULVH-01A-01D, TARGET-30-PATXUG-01A-01D, TARGET-10-PANSYA-09A-01D, TARGET-30-PASLMN-01A-01D, TARGET-30-PAVCKK-01A-01D, TARGET-30-PAKVUY-01A-01D, TARGET-10-PARCKV-09A-01D, TARGET-10-PAPGEE-09A-01D, TARGET-10-PARCTN-09A-01D, TARGET-30-PATTFB-01A-01D, TARGET-30-PATBKX-01A-01D, TARGET-10-PAPIHU-09A-01D, TARGET-30-PATIYD-09A-01D, TARGET-30-PASZJB-01A-01D, TARGET-30-PATXKG-01A-01D, TARGET-30-PATFTN-01A-01D, TARGET-10-PAPDDA-09A-01D, TARGET-10-PAPCNP-09A-01D, TARGET-30-PASLTC-01A-01D, TARGET-30-PATLUI-01A-01D, TARGET-30-PARVVM-09A-01D, TARGET-30-PASVWG-01A-01D, TARGET-30-PASTIJ-01A-01D, TARGET-30-PASFKX-01A-01D, TARGET-10-PARDKG-09A-01D, TARGET-30-PATBPG-01A-01D, TARGET-30-PASSEC-01A-01D, TARGET-30-PAUXFZ-01A-01D, TARGET-30-PATDFU-01A-01D, TARGET-10-PARBRX-09A-01D, TARGET-30-PALJUV-01A-01W, TARGET-30-PATSDR-01A-01D, TARGET-10-PARMMA-09A-01D, TARGET-10-PARCMG-03A-01D, TARGET-10-PAPCVI-09A-01D, TARGET-30-PASEGF-01A-01D, TARGET-30-PAUBRR-01A-01D, TARGET-10-PAPAGK-03A-01D, TARGET-30-PATZIG-01A-01D, TARGET-10-PAPIJD-09A-01D, TARGET-50-PAJLWT-01A-01D, TARGET-50-PAJLNJ-01A-01D, TARGET-50-PAJNLT-01A-01D, TARGET-10-PASMIC-09A-01D, TARGET-10-PARAYM-09A-01D, TARGET-10-PATDBU-09A-01D, TARGET-10-PASMNV-09A-01D, TARGET-10-PATNIA-09A-01D, TARGET-50-PAJPAR-01A-01D, TARGET-40-PARFTG-01A-01D, TARGET-10-PASZEW-09A-01D, TARGET-30-PASXRJ-01A-01D, TARGET-30-PASKYH-01A-01D, TARGET-10-PAPHLH-09A-01D, TARGET-30-PATSRD-01A-01D, TARGET-30-PASPXU-01A-01D, TARGET-30-PAIXRK-01A-01W, TARGET-30-PATJXV-01A-01D, TARGET-30-PATBRX-01A-01D, TARGET-10-PAPERN-09A-01D, TARGET-30-PAUMMZ-01A-01D, TARGET-30-PASCFA-01A-01D, TARGET-10-PARBIX-03A-01D, TARGET-30-PARXMM-01A-01D, TARGET-30-PATMXC-01A-01D, TARGET-30-PATWGC-01A-01D, TARGET-30-PASLIH-01A-01D, TARGET-30-PAUWYM-01A-01D, TARGET-30-PAKGKH-01A-01W, TARGET-30-PAUICI-01A-01D, TARGET-30-PATPPK-01A-01D, TARGET-30-PANKFE-01A-01W, TARGET-30-PASFGD-01A-01D, TARGET-30-PAURCG-01A-01D, TARGET-30-PAUZMG-01A-01D, TARGET-30-PARXVA-01A-01D, TARGET-30-PARUPT-01A-01D, TARGET-30-PARXMH-01A-01D, TARGET-30-PASFNF-01A-01D, TARGET-30-PARYWX-01A-01D, TARGET-30-PASFDV-01A-01D, TARGET-30-PALZZV-01A-01W, TARGET-30-PAUBYW-01A-01D, TARGET-30-PASEAR-01A-01D, TARGET-30-PATUNZ-01A
Add total mutations per sample.
# Count the total number of signature mutations for each sample
total_muts <- apply(sigs_input, 1, sum)
Get list of tumor sample ids.
tumor_sample_ids <- maf %>%
dplyr::filter(Tumor_Sample_Barcode %in% rownames(sigs_input)) %>%
dplyr::distinct(Tumor_Sample_Barcode) %>%
dplyr::pull(Tumor_Sample_Barcode)
Get COSMIC v2 signatures for each sample. This step will take some time.
sample_sigs_cosmic <- lapply(tumor_sample_ids, function(sample_id) {
# Determine the signatures contributing to the sample
whichSignatures(
tumor.ref = sigs_input,
signatures.ref = signatures.cosmic,
sample.id = sample_id,
contexts.needed = TRUE
)
})
# Bring along the names
names(sample_sigs_cosmic) <- tumor_sample_ids
# Create matrix of COSMIC signature weights
cosmic_weights <- lapply(sample_sigs_cosmic, "[[", "weights")
cosmic_wide <- do.call(dplyr::bind_rows, cosmic_weights) %>%
add_column('Kids_First_Biospecimen_ID' = unlist(lapply(cosmic_weights, rownames)), .before = 1) %>%
tibble::as_tibble() %>%
readr::write_tsv(file.path(results_dir, 'cosmicv2_signature_exposure_matrix.tsv'))
Get Alexandrov et al, 2013 signatures for each sample.
sample_sigs_nature <- lapply(tumor_sample_ids, function(sample_id) {
# Determine the signatures contributing to the sample
whichSignatures(
tumor.ref = sigs_input,
signatures.ref = signatures.nature2013,
sample.id = sample_id,
contexts.needed = TRUE
)
})
# Bring along the names
names(sample_sigs_nature) <- tumor_sample_ids
# Create data frame of Nature signature weights
nature_weights <- lapply(sample_sigs_nature, "[[", "weights")
nature_wide <- do.call(dplyr::bind_rows, nature_weights) %>%
add_column('Kids_First_Biospecimen_ID' = unlist(lapply(nature_weights, rownames)), .before = 1) %>%
tibble::as_tibble() %>%
readr::write_tsv(file.path(results_dir, 'nature_signature_exposure_matrix.tsv'))
Get COSMIC genome v3.3 signatures for each sample.
signatures.cosmic.v3.3 <- read_tsv(file.path(input_dir, 'COSMIC_v3.3.1_SBS_GRCh38.txt')) %>%
column_to_rownames('Type') %>%
t %>%
as.data.frame()
Rows: 96 Columns: 80
── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (1): Type
dbl (79): SBS1, SBS2, SBS3, SBS4, SBS5, SBS6, SBS7a, SBS7b, SBS7c, SBS7d, SB...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Create list of include/exclude signatures using SBSv3 mapping file. For Initial CNS tumors, therapy exposures will be excluded, but they will be retained for other tumor types.
map <- read_tsv(file.path(input_dir, "sbs_v3_map.tsv"))
Rows: 79 Columns: 4
── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (4): Signature, Meaning, Broad_category, Narrow_category
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Excluded signatures:
artifact_sigs <- map %>%
filter(Broad_category == "Sequencing artifact") %>%
pull(Signature)
environ_sigs <- map %>%
filter(Broad_category == "Environmental exposure") %>%
pull(Signature)
unknown <- map %>%
filter(Broad_category == "Unknown" & Signature != "SBS39") %>%
pull(Signature)
therapy_sigs <- map %>%
filter(Broad_category == "Therapy exposure") %>%
pull(Signature)
exclude_sigs <- c(artifact_sigs, environ_sigs, unknown)
include_sigs_initialPretx <- setdiff(rownames(signatures.cosmic.v3.3), c(exclude_sigs, therapy_sigs))
include_sigs_other <- setdiff(rownames(signatures.cosmic.v3.3), exclude_sigs)
Run signature extraction. This step will take some time.
sample_sigs_cosmic_v33 <- lapply(tumor_sample_ids, function(sample_id) {
# Determine the signatures contributing to the sample
if (sample_id %in% initial_pretx_tumors){
whichSignatures(
tumor.ref = sigs_input,
signatures.ref = signatures.cosmic.v3.3,
sample.id = sample_id,
contexts.needed = TRUE,
associated = include_sigs_initialPretx
)
}else{
whichSignatures(
tumor.ref = sigs_input,
signatures.ref = signatures.cosmic.v3.3,
sample.id = sample_id,
contexts.needed = TRUE,
associated = include_sigs_other
)
}
})
# Bring along the names
names(sample_sigs_cosmic_v33) <- tumor_sample_ids
#
# Create matrix of COSMIC signature weights
cosmic_v33_weights <- lapply(sample_sigs_cosmic_v33, "[[", "weights")
cosmic_v33_wide <- do.call(dplyr::bind_rows, cosmic_v33_weights) %>%
add_column('Kids_First_Biospecimen_ID' = unlist(lapply(cosmic_v33_weights, rownames)), .before = 1) %>%
tibble::as_tibble() %>%
readr::write_tsv(file.path(results_dir, 'cosmicv3.3_signature_exposure_matrix.tsv'))
Do this for COSMIC v2 mutation signatures.
# Calculate mutations per signature
cosmic_sigs_df <- calc_mut_per_sig(
sample_sigs_cosmic,
muts_per_sample = total_muts,
region_size = region_sizes,
metadata = metadata_df
) %>%
dplyr::filter(grepl("Signature", signature))
Using Tumor_Sample_Barcode, sample_id, aliquot_id, Kids_First_Participant_ID, experimental_strategy, sample_type, composition, tumor_descriptor, primary_site, reported_gender, race, ethnicity, pathology_diagnosis, RNA_library, OS_status, cohort, seq_center, cancer_predispositions, pathology_free_text_diagnosis, cohort_participant_id, extent_of_tumor_resection, CNS_region, gtex_group, gtex_subgroup, germline_sex_estimate, clinical_status_at_event, cell_line_composition, dkfz_v11_methylation_subclass, dkfz_v12_methylation_subclass, dkfz_v12_methylation_mgmt_status, molecular_subtype, integrated_diagnosis, Notes, harmonized_diagnosis, molecular_subtype_methyl, broad_histology, short_histology, cancer_group, display_group, hex_codes as id variables
Warning: attributes are not identical across measure variables; they will be
dropped
# Write this to a file but drop the color column
cosmic_sigs_df %>%
dplyr::select(-hex_codes) %>%
readr::write_tsv(file.path(results_dir, "cosmicv2_signatures_results.tsv"))
# Print out a preview
cosmic_sigs_df
Do this for COSMIC v3.3 mutation signatures.
# Calculate mutations per signature
cosmicv3_sigs_df <- calc_mut_per_sig(
sample_sigs_cosmic_v33,
muts_per_sample = total_muts,
region_size = region_sizes,
metadata = metadata_df
) %>%
dplyr::filter(grepl("SBS", signature))
Using Tumor_Sample_Barcode, sample_id, aliquot_id, Kids_First_Participant_ID, experimental_strategy, sample_type, composition, tumor_descriptor, primary_site, reported_gender, race, ethnicity, pathology_diagnosis, RNA_library, OS_status, cohort, seq_center, cancer_predispositions, pathology_free_text_diagnosis, cohort_participant_id, extent_of_tumor_resection, CNS_region, gtex_group, gtex_subgroup, germline_sex_estimate, clinical_status_at_event, cell_line_composition, dkfz_v11_methylation_subclass, dkfz_v12_methylation_subclass, dkfz_v12_methylation_mgmt_status, molecular_subtype, integrated_diagnosis, Notes, harmonized_diagnosis, molecular_subtype_methyl, broad_histology, short_histology, cancer_group, display_group, hex_codes as id variables
Warning: attributes are not identical across measure variables; they will be
dropped
# Write this to a file but drop the color column
cosmicv3_sigs_df %>%
# dplyr::filter(grepl("SBS", signature)) %>%
dplyr::select(-hex_codes) %>%
readr::write_tsv(file.path(results_dir, "cosmicv3.3_signatures_results.tsv"))
# Print out a preview
cosmicv3_sigs_df
Do this for Alexandrov et al, 2013 mutation signatures.
#Calculate mutations per signature
nature_sigs_df <- calc_mut_per_sig(
sample_sigs_nature,
muts_per_sample = total_muts,
region_size = region_sizes,
metadata = metadata_df
) %>%
dplyr::filter(grepl("Signature", signature))
Using Tumor_Sample_Barcode, sample_id, aliquot_id, Kids_First_Participant_ID, experimental_strategy, sample_type, composition, tumor_descriptor, primary_site, reported_gender, race, ethnicity, pathology_diagnosis, RNA_library, OS_status, cohort, seq_center, cancer_predispositions, pathology_free_text_diagnosis, cohort_participant_id, extent_of_tumor_resection, CNS_region, gtex_group, gtex_subgroup, germline_sex_estimate, clinical_status_at_event, cell_line_composition, dkfz_v11_methylation_subclass, dkfz_v12_methylation_subclass, dkfz_v12_methylation_mgmt_status, molecular_subtype, integrated_diagnosis, Notes, harmonized_diagnosis, molecular_subtype_methyl, broad_histology, short_histology, cancer_group, display_group, hex_codes as id variables
Warning: attributes are not identical across measure variables; they will be
dropped
# Write this to a file but drop the color column
nature_sigs_df %>%
# dplyr::filter(grepl("SBS", signature)) %>%
dplyr::select(-hex_codes) %>%
readr::write_tsv(file.path(results_dir, "nature_signatures_results.tsv"))
# Print out a preview
nature_sigs_df
For COSMIC v2 signatures
bubble_matrix_plot(cosmic_sigs_df,
label = "COSMIC Signatures",
color_palette = gradient_col_palette$hex_codes
)
`summarise()` has grouped output by 'display_group'. You can override using the
`.groups` argument.
Warning: Removed 207 rows containing missing values (`geom_point()`).
ggplot2::ggsave(
file.path(cosmicv2_plots, "bubble_matrix_cosmicv2_mutation_sig.png"),
width = 30, height = 20, units = "cm")
Warning: Removed 207 rows containing missing values (`geom_point()`).
For Nature signatures
bubble_matrix_plot(nature_sigs_df,
label = "Alexandrov et al, 2013 signatures",
color_palette = gradient_col_palette$hex_codes)
`summarise()` has grouped output by 'display_group'. You can override using the
`.groups` argument.
Warning: Removed 151 rows containing missing values (`geom_point()`).
ggplot2::ggsave(
file.path(nature_plots, "bubble_matrix_nature_mutation_sig.png"),
width = 30, height = 20, units = "cm")
Warning: Removed 151 rows containing missing values (`geom_point()`).
For COSMIC v3.3 signatures
bubble_matrix_plot(cosmicv3_sigs_df,
label = "COSMIC Signatures",
color_palette = gradient_col_palette$hex_codes
)
`summarise()` has grouped output by 'display_group'. You can override using the
`.groups` argument.
Warning: Removed 986 rows containing missing values (`geom_point()`).
ggplot2::ggsave(
file.path(cosmicv3_plots, "bubble_matrix_cosmicv3_mutation_sig.png"),
width = 30, height = 20, units = "cm")
Warning: Removed 986 rows containing missing values (`geom_point()`).
We will make these plots for primary tumor samples only. Lets make these for COSMIC mutation signatures first.
# Make grouped bar plots
lapply(unique(cosmic_sigs_df$display_group),
grouped_sig_barplot,
sig_num_df = cosmic_sigs_df,
output_dir = file.path(cosmicv2_scratch, "signature_grouped_barplots"),
label = "cosmic_v2"
)
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Make these plots for Alexandrov et al, 2013 signatures.
# Make grouped bar plots
lapply(unique(nature_sigs_df$display_group),
grouped_sig_barplot,
sig_num_df = nature_sigs_df,
output_dir = file.path(nature_scratch, "signature_grouped_barplots"),
label = "nature"
)
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Make these plots for COSMIC v3.3 signatures.
# Make grouped bar plots
lapply(unique(cosmicv3_sigs_df$display_group),
grouped_sig_barplot,
sig_num_df = cosmicv3_sigs_df,
output_dir = file.path(cosmicv3_scratch, "signature_grouped_barplots"),
label = "cosmic_v3.3"
)
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
sessionInfo()
R version 4.2.3 (2023-03-15)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 22.04.2 LTS
Matrix products: default
BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
[3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
[5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
[7] LC_PAPER=en_US.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] lubridate_1.9.2 forcats_1.0.0 stringr_1.5.0
[4] dplyr_1.1.1 purrr_1.0.1 readr_2.1.4
[7] tidyr_1.3.0 tibble_3.2.1 ggplot2_3.4.2
[10] tidyverse_2.0.0 deconstructSigs_1.9.0
loaded via a namespace (and not attached):
[1] Biobase_2.58.0 MatrixGenerics_1.10.0
[3] sass_0.4.5 bit64_4.0.5
[5] vroom_1.6.1 jsonlite_1.8.4
[7] R.utils_2.12.2 bslib_0.4.2
[9] highr_0.10 stats4_4.2.3
[11] BSgenome_1.66.3 GenomeInfoDbData_1.2.9
[13] Rsamtools_2.14.0 yaml_2.3.7
[15] lattice_0.21-8 pillar_1.9.0
[17] glue_1.6.2 digest_0.6.31
[19] GenomicRanges_1.50.2 XVector_0.38.0
[21] colorspace_2.1-0 plyr_1.8.8
[23] Matrix_1.5-4 htmltools_0.5.5
[25] R.oo_1.25.0 XML_3.99-0.14
[27] pkgconfig_2.0.3 zlibbioc_1.44.0
[29] scales_1.2.1 tzdb_0.3.0
[31] BiocParallel_1.32.6 timechange_0.2.0
[33] generics_0.1.3 farver_2.1.1
[35] IRanges_2.32.0 SummarizedExperiment_1.28.0
[37] cachem_1.0.7 withr_2.5.0
[39] BiocGenerics_0.44.0 cli_3.6.1
[41] magrittr_2.0.3 crayon_1.5.2
[43] evaluate_0.20 R.methodsS3_1.8.2
[45] fansi_1.0.4 textshaping_0.3.6
[47] tools_4.2.3 data.table_1.14.8
[49] hms_1.1.3 BiocIO_1.8.0
[51] lifecycle_1.0.3 matrixStats_0.63.0
[53] S4Vectors_0.36.2 munsell_0.5.0
[55] DelayedArray_0.24.0 Biostrings_2.66.0
[57] compiler_4.2.3 jquerylib_0.1.4
[59] GenomeInfoDb_1.34.9 systemfonts_1.0.4
[61] rlang_1.1.0 grid_4.2.3
[63] RCurl_1.98-1.12 rjson_0.2.21
[65] bitops_1.0-7 labeling_0.4.2
[67] rmarkdown_2.21 restfulr_0.0.15
[69] gtable_0.3.3 codetools_0.2-19
[71] reshape2_1.4.4 R6_2.5.1
[73] GenomicAlignments_1.34.1 knitr_1.42
[75] rtracklayer_1.58.0 fastmap_1.1.1
[77] bit_4.0.5 utf8_1.2.3
[79] ragg_1.2.5 stringi_1.7.12
[81] Rcpp_1.0.10 parallel_4.2.3
[83] vctrs_0.6.2 tidyselect_1.2.0
[85] xfun_0.38 BSgenome.Hsapiens.UCSC.hg38_1.4.5